{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>InvoiceNo</th>\n",
       "      <th>StockCode</th>\n",
       "      <th>Description</th>\n",
       "      <th>Quantity</th>\n",
       "      <th>InvoiceDate</th>\n",
       "      <th>UnitPrice</th>\n",
       "      <th>CustomerID</th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>536365</td>\n",
       "      <td>85123A</td>\n",
       "      <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.55</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>536365</td>\n",
       "      <td>71053</td>\n",
       "      <td>WHITE METAL LANTERN</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>536365</td>\n",
       "      <td>84406B</td>\n",
       "      <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
       "      <td>8</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.75</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029G</td>\n",
       "      <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029E</td>\n",
       "      <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  InvoiceNo StockCode                          Description  Quantity  \\\n",
       "0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   \n",
       "1    536365     71053                  WHITE METAL LANTERN         6   \n",
       "2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   \n",
       "3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   \n",
       "4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   \n",
       "\n",
       "          InvoiceDate  UnitPrice  CustomerID         Country  \n",
       "0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  \n",
       "1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  \n",
       "2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  \n",
       "3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  \n",
       "4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  "
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "df = pd.read_excel(\"Online_Retail.xlsx\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It took a few monutes to load the data, so I keep a copy as a backup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need to do santity check about the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.Country.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',\n",
       "       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',\n",
       "       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',\n",
       "       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',\n",
       "       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',\n",
       "       'Lebanon', 'United Arab Emirates', 'Saudi Arabia', 'Czech Republic',\n",
       "       'Canada', 'Unspecified', 'Brazil', 'USA', 'European Community',\n",
       "       'Malta', 'RSA'], dtype=object)"
      ]
     },
     "execution_count": 188,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.Country.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>CustomerID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>3950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Germany</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>France</td>\n",
       "      <td>87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>Spain</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>Switzerland</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Portugal</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Italy</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Finland</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Austria</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Norway</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Netherlands</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Australia</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Channel Islands</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Denmark</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Cyprus</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Japan</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>Poland</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>USA</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Canada</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>Unspecified</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Israel</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Greece</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>EIRE</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Malta</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>United Arab Emirates</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Bahrain</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Lithuania</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Czech Republic</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Lebanon</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>RSA</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Saudi Arabia</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Singapore</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Iceland</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>European Community</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Hong Kong</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 Country  CustomerID\n",
       "36        United Kingdom        3950\n",
       "14               Germany          95\n",
       "13                France          87\n",
       "31                 Spain          31\n",
       "3                Belgium          25\n",
       "33           Switzerland          21\n",
       "27              Portugal          19\n",
       "19                 Italy          15\n",
       "12               Finland          12\n",
       "1                Austria          11\n",
       "25                Norway          10\n",
       "24           Netherlands           9\n",
       "0              Australia           9\n",
       "6        Channel Islands           9\n",
       "9                Denmark           9\n",
       "7                 Cyprus           8\n",
       "32                Sweden           8\n",
       "20                 Japan           8\n",
       "26                Poland           6\n",
       "34                   USA           4\n",
       "5                 Canada           4\n",
       "37           Unspecified           4\n",
       "18                Israel           4\n",
       "15                Greece           4\n",
       "10                  EIRE           3\n",
       "23                 Malta           2\n",
       "35  United Arab Emirates           2\n",
       "2                Bahrain           2\n",
       "22             Lithuania           1\n",
       "8         Czech Republic           1\n",
       "21               Lebanon           1\n",
       "28                   RSA           1\n",
       "29          Saudi Arabia           1\n",
       "30             Singapore           1\n",
       "17               Iceland           1\n",
       "4                 Brazil           1\n",
       "11    European Community           1\n",
       "16             Hong Kong           0"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "customer_country=df1[['Country','CustomerID']].drop_duplicates()\n",
    "\n",
    "customer_country.groupby(['Country'])['CustomerID'].aggregate('count').reset_index().sort_values('CustomerID', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More than 90% of the customers in the data are from United Kingdom, There’s some research indicating that customer clusters vary by geography, so here I’ll restrict the data to United Kingdom only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = df1.loc[df1['Country'] == 'United Kingdom']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check whether there are missing values in each column.\n",
    "\n",
    "There are 133600 missing values in CustomerID column, since our analysis is based on customers, we will remove these missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "InvoiceNo           0\n",
       "StockCode           0\n",
       "Description      1454\n",
       "Quantity            0\n",
       "InvoiceDate         0\n",
       "UnitPrice           0\n",
       "CustomerID     133600\n",
       "Country             0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 227,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.isnull().sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "InvoiceNo      0\n",
       "StockCode      0\n",
       "Description    0\n",
       "Quantity       0\n",
       "InvoiceDate    0\n",
       "UnitPrice      0\n",
       "CustomerID     0\n",
       "Country        0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df1[pd.notnull(df1['CustomerID'])]\n",
    "df1.isnull().sum(axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the min and max values in Unit price column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 229,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.UnitPrice.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-80995"
      ]
     },
     "execution_count": 230,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.Quantity.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove the negative values in Quantity column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 231,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df1[(df1['Quantity']>0)]\n",
    "df1.Quantity.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After cleaning up, we now dealing with 354345 rows and 8 columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(354345, 8)"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 354345 entries, 0 to 541893\n",
      "Data columns (total 8 columns):\n",
      "InvoiceNo      354345 non-null object\n",
      "StockCode      354345 non-null object\n",
      "Description    354345 non-null object\n",
      "Quantity       354345 non-null int64\n",
      "InvoiceDate    354345 non-null datetime64[ns]\n",
      "UnitPrice      354345 non-null float64\n",
      "CustomerID     354345 non-null float64\n",
      "Country        354345 non-null object\n",
      "dtypes: datetime64[ns](1), float64(2), int64(1), object(4)\n",
      "memory usage: 24.3+ MB\n"
     ]
    }
   ],
   "source": [
    "df1.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check unique value for each column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "InvoiceNo :  16649\n",
      "StockCode :  3645\n",
      "Description :  3844\n",
      "Quantity :  294\n",
      "InvoiceDate :  15615\n",
      "UnitPrice :  403\n",
      "CustomerID :  3921\n",
      "Country :  1\n"
     ]
    }
   ],
   "source": [
    "def unique_counts(df1):\n",
    "   for i in df1.columns:\n",
    "       count = df1[i].nunique()\n",
    "       print(i, \": \", count)\n",
    "unique_counts(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>InvoiceNo</th>\n",
       "      <th>StockCode</th>\n",
       "      <th>Description</th>\n",
       "      <th>Quantity</th>\n",
       "      <th>InvoiceDate</th>\n",
       "      <th>UnitPrice</th>\n",
       "      <th>CustomerID</th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>536365</td>\n",
       "      <td>85123A</td>\n",
       "      <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.55</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>536365</td>\n",
       "      <td>71053</td>\n",
       "      <td>WHITE METAL LANTERN</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>536365</td>\n",
       "      <td>84406B</td>\n",
       "      <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
       "      <td>8</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.75</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029G</td>\n",
       "      <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029E</td>\n",
       "      <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  InvoiceNo StockCode                          Description  Quantity  \\\n",
       "0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   \n",
       "1    536365     71053                  WHITE METAL LANTERN         6   \n",
       "2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   \n",
       "3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   \n",
       "4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   \n",
       "\n",
       "          InvoiceDate  UnitPrice  CustomerID         Country  \n",
       "0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  \n",
       "1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  \n",
       "2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  \n",
       "3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  \n",
       "4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  "
      ]
     },
     "execution_count": 235,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add a column for total price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>InvoiceNo</th>\n",
       "      <th>StockCode</th>\n",
       "      <th>Description</th>\n",
       "      <th>Quantity</th>\n",
       "      <th>InvoiceDate</th>\n",
       "      <th>UnitPrice</th>\n",
       "      <th>CustomerID</th>\n",
       "      <th>Country</th>\n",
       "      <th>TotalPrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>536365</td>\n",
       "      <td>85123A</td>\n",
       "      <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.55</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>15.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>536365</td>\n",
       "      <td>71053</td>\n",
       "      <td>WHITE METAL LANTERN</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>20.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>536365</td>\n",
       "      <td>84406B</td>\n",
       "      <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
       "      <td>8</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>2.75</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>22.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029G</td>\n",
       "      <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>20.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>536365</td>\n",
       "      <td>84029E</td>\n",
       "      <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
       "      <td>6</td>\n",
       "      <td>2010-12-01 08:26:00</td>\n",
       "      <td>3.39</td>\n",
       "      <td>17850.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>20.34</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  InvoiceNo StockCode                          Description  Quantity  \\\n",
       "0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   \n",
       "1    536365     71053                  WHITE METAL LANTERN         6   \n",
       "2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   \n",
       "3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   \n",
       "4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   \n",
       "\n",
       "          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \n",
       "0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom       15.30  \n",
       "1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom       20.34  \n",
       "2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom       22.00  \n",
       "3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom       20.34  \n",
       "4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom       20.34  "
      ]
     },
     "execution_count": 236,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1['TotalPrice'] = df1['Quantity'] * df1['UnitPrice']\n",
    "df1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find out first and last order date in the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2010-12-01 08:26:00')"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1['InvoiceDate'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2011-12-09 12:49:00')"
      ]
     },
     "execution_count": 238,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1['InvoiceDate'].max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since recency is calculated for a point in time. The last invoice date is 2011-12-09, this is the date we will use to calculate recency."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "NOW = dt.datetime(2011,12,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df1['InvoiceDate'] = pd.to_datetime(df1['InvoiceDate'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a RFM table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rfmTable = df1.groupby('CustomerID').agg({'InvoiceDate': lambda x: (NOW - x.max()).days, # Recency\n",
    "                                        'InvoiceNo': lambda x: len(x),      # Frequency\n",
    "                                        'TotalPrice': lambda x: x.sum()}) # Monetary Value\n",
    "\n",
    "rfmTable['InvoiceDate'] = rfmTable['InvoiceDate'].astype(int)\n",
    "rfmTable.rename(columns={'InvoiceDate': 'recency', \n",
    "                         'InvoiceNo': 'frequency', \n",
    "                         'TotalPrice': 'monetary_value'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate RFM metrics for each customer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recency</th>\n",
       "      <th>frequency</th>\n",
       "      <th>monetary_value</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CustomerID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12346.0</th>\n",
       "      <td>325</td>\n",
       "      <td>1</td>\n",
       "      <td>77183.60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12747.0</th>\n",
       "      <td>2</td>\n",
       "      <td>103</td>\n",
       "      <td>4196.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12748.0</th>\n",
       "      <td>0</td>\n",
       "      <td>4596</td>\n",
       "      <td>33719.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12749.0</th>\n",
       "      <td>3</td>\n",
       "      <td>199</td>\n",
       "      <td>4090.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12820.0</th>\n",
       "      <td>3</td>\n",
       "      <td>59</td>\n",
       "      <td>942.34</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            recency  frequency  monetary_value\n",
       "CustomerID                                    \n",
       "12346.0         325          1        77183.60\n",
       "12747.0           2        103         4196.01\n",
       "12748.0           0       4596        33719.73\n",
       "12749.0           3        199         4090.88\n",
       "12820.0           3         59          942.34"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rfmTable.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Interpretation:\n",
    "\n",
    "CustomerID 12346 has frequency:1, monetary value:$77183.60 and recency:324 days.\n",
    "\n",
    "CustomerID 12747 has frequency: 103, monetary value: $4196.01 and recency: 1 day\n",
    "\n",
    "Let's check the details of the first customer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>InvoiceNo</th>\n",
       "      <th>StockCode</th>\n",
       "      <th>Description</th>\n",
       "      <th>Quantity</th>\n",
       "      <th>InvoiceDate</th>\n",
       "      <th>UnitPrice</th>\n",
       "      <th>CustomerID</th>\n",
       "      <th>Country</th>\n",
       "      <th>TotalPrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>61619</th>\n",
       "      <td>541431</td>\n",
       "      <td>23166</td>\n",
       "      <td>MEDIUM CERAMIC TOP STORAGE JAR</td>\n",
       "      <td>74215</td>\n",
       "      <td>2011-01-18 10:01:00</td>\n",
       "      <td>1.04</td>\n",
       "      <td>12346.0</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>77183.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      InvoiceNo StockCode                     Description  Quantity  \\\n",
       "61619    541431     23166  MEDIUM CERAMIC TOP STORAGE JAR     74215   \n",
       "\n",
       "              InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \n",
       "61619 2011-01-18 10:01:00       1.04     12346.0  United Kingdom     77183.6  "
      ]
     },
     "execution_count": 243,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_customer = df1[df1['CustomerID']== 12346.0]\n",
    "first_customer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first customer has shopped only once, bought one item at a huge quantity(74215). The unit price is very low, seems a clearance sale."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 244,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(NOW - dt.datetime(2011,1,18)).days==326"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The easies way to split metrics into segments is by using quartile. \n",
    "\n",
    "1. This gives us a starting point for detailed analysis\n",
    "2. 4 segments are easy to understand and explain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recency</th>\n",
       "      <th>frequency</th>\n",
       "      <th>monetary_value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0.25</th>\n",
       "      <td>17.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>300.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0.50</th>\n",
       "      <td>50.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>651.82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0.75</th>\n",
       "      <td>142.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>1575.89</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      recency  frequency  monetary_value\n",
       "0.25     17.0       17.0          300.04\n",
       "0.50     50.0       41.0          651.82\n",
       "0.75    142.0       99.0         1575.89"
      ]
     },
     "execution_count": 245,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quantiles = rfmTable.quantile(q=[0.25,0.5,0.75])\n",
    "quantiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'frequency': {0.25: 17.0, 0.5: 41.0, 0.75: 99.0},\n",
       " 'monetary_value': {0.25: 300.03999999999996,\n",
       "  0.5: 651.82000000000016,\n",
       "  0.75: 1575.8900000000003},\n",
       " 'recency': {0.25: 17.0, 0.5: 50.0, 0.75: 142.0}}"
      ]
     },
     "execution_count": 246,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quantiles = quantiles.to_dict()\n",
    "quantiles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a segmented RFM table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "segmented_rfm = rfmTable"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lowest recency, highest frequency and monetary are our best customers "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def RScore(x,p,d):\n",
    "    if x <= d[p][0.25]:\n",
    "        return 1\n",
    "    elif x <= d[p][0.50]:\n",
    "        return 2\n",
    "    elif x <= d[p][0.75]: \n",
    "        return 3\n",
    "    else:\n",
    "        return 4\n",
    "    \n",
    "def FMScore(x,p,d):\n",
    "    if x <= d[p][0.25]:\n",
    "        return 4\n",
    "    elif x <= d[p][0.50]:\n",
    "        return 3\n",
    "    elif x <= d[p][0.75]: \n",
    "        return 2\n",
    "    else:\n",
    "        return 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "segmented_rfm['r_quartile'] = segmented_rfm['recency'].apply(RScore, args=('recency',quantiles,))\n",
    "segmented_rfm['f_quartile'] = segmented_rfm['frequency'].apply(FMScore, args=('frequency',quantiles,))\n",
    "segmented_rfm['m_quartile'] = segmented_rfm['monetary_value'].apply(FMScore, args=('monetary_value',quantiles,))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add segment numbers to the RFM table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recency</th>\n",
       "      <th>frequency</th>\n",
       "      <th>monetary_value</th>\n",
       "      <th>r_quartile</th>\n",
       "      <th>f_quartile</th>\n",
       "      <th>m_quartile</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CustomerID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12346.0</th>\n",
       "      <td>325</td>\n",
       "      <td>1</td>\n",
       "      <td>77183.60</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12747.0</th>\n",
       "      <td>2</td>\n",
       "      <td>103</td>\n",
       "      <td>4196.01</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12748.0</th>\n",
       "      <td>0</td>\n",
       "      <td>4596</td>\n",
       "      <td>33719.73</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12749.0</th>\n",
       "      <td>3</td>\n",
       "      <td>199</td>\n",
       "      <td>4090.88</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12820.0</th>\n",
       "      <td>3</td>\n",
       "      <td>59</td>\n",
       "      <td>942.34</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            recency  frequency  monetary_value  r_quartile  f_quartile  \\\n",
       "CustomerID                                                               \n",
       "12346.0         325          1        77183.60           4           4   \n",
       "12747.0           2        103         4196.01           1           1   \n",
       "12748.0           0       4596        33719.73           1           1   \n",
       "12749.0           3        199         4090.88           1           1   \n",
       "12820.0           3         59          942.34           1           2   \n",
       "\n",
       "            m_quartile  \n",
       "CustomerID              \n",
       "12346.0              1  \n",
       "12747.0              1  \n",
       "12748.0              1  \n",
       "12749.0              1  \n",
       "12820.0              2  "
      ]
     },
     "execution_count": 250,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "segmented_rfm.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "RFM segments split your customer base into an imaginary 3D cube. It is hard to visualize. However, we can sort it out.\n",
    "\n",
    "Add a new column to combine RFM score, 111 is the highest score as we determined earlier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recency</th>\n",
       "      <th>frequency</th>\n",
       "      <th>monetary_value</th>\n",
       "      <th>r_quartile</th>\n",
       "      <th>f_quartile</th>\n",
       "      <th>m_quartile</th>\n",
       "      <th>RFMScore</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CustomerID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12346.0</th>\n",
       "      <td>325</td>\n",
       "      <td>1</td>\n",
       "      <td>77183.60</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>441</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12747.0</th>\n",
       "      <td>2</td>\n",
       "      <td>103</td>\n",
       "      <td>4196.01</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12748.0</th>\n",
       "      <td>0</td>\n",
       "      <td>4596</td>\n",
       "      <td>33719.73</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12749.0</th>\n",
       "      <td>3</td>\n",
       "      <td>199</td>\n",
       "      <td>4090.88</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12820.0</th>\n",
       "      <td>3</td>\n",
       "      <td>59</td>\n",
       "      <td>942.34</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            recency  frequency  monetary_value  r_quartile  f_quartile  \\\n",
       "CustomerID                                                               \n",
       "12346.0         325          1        77183.60           4           4   \n",
       "12747.0           2        103         4196.01           1           1   \n",
       "12748.0           0       4596        33719.73           1           1   \n",
       "12749.0           3        199         4090.88           1           1   \n",
       "12820.0           3         59          942.34           1           2   \n",
       "\n",
       "            m_quartile RFMScore  \n",
       "CustomerID                       \n",
       "12346.0              1      441  \n",
       "12747.0              1      111  \n",
       "12748.0              1      111  \n",
       "12749.0              1      111  \n",
       "12820.0              2      122  "
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "segmented_rfm['RFMScore'] = segmented_rfm.r_quartile.map(str) \\\n",
    "                            + segmented_rfm.f_quartile.map(str) \\\n",
    "                            + segmented_rfm.m_quartile.map(str)\n",
    "segmented_rfm.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apparently, the first customer is not our best customer at all."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is top 10 of our best customers!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recency</th>\n",
       "      <th>frequency</th>\n",
       "      <th>monetary_value</th>\n",
       "      <th>r_quartile</th>\n",
       "      <th>f_quartile</th>\n",
       "      <th>m_quartile</th>\n",
       "      <th>RFMScore</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CustomerID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18102.0</th>\n",
       "      <td>0</td>\n",
       "      <td>431</td>\n",
       "      <td>259657.30</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17450.0</th>\n",
       "      <td>8</td>\n",
       "      <td>337</td>\n",
       "      <td>194550.79</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17511.0</th>\n",
       "      <td>2</td>\n",
       "      <td>963</td>\n",
       "      <td>91062.38</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16684.0</th>\n",
       "      <td>4</td>\n",
       "      <td>277</td>\n",
       "      <td>66653.56</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14096.0</th>\n",
       "      <td>4</td>\n",
       "      <td>5111</td>\n",
       "      <td>65164.79</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13694.0</th>\n",
       "      <td>3</td>\n",
       "      <td>568</td>\n",
       "      <td>65039.62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15311.0</th>\n",
       "      <td>0</td>\n",
       "      <td>2379</td>\n",
       "      <td>60767.90</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13089.0</th>\n",
       "      <td>2</td>\n",
       "      <td>1818</td>\n",
       "      <td>58825.83</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15769.0</th>\n",
       "      <td>7</td>\n",
       "      <td>130</td>\n",
       "      <td>56252.72</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15061.0</th>\n",
       "      <td>3</td>\n",
       "      <td>403</td>\n",
       "      <td>54534.14</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            recency  frequency  monetary_value  r_quartile  f_quartile  \\\n",
       "CustomerID                                                               \n",
       "18102.0           0        431       259657.30           1           1   \n",
       "17450.0           8        337       194550.79           1           1   \n",
       "17511.0           2        963        91062.38           1           1   \n",
       "16684.0           4        277        66653.56           1           1   \n",
       "14096.0           4       5111        65164.79           1           1   \n",
       "13694.0           3        568        65039.62           1           1   \n",
       "15311.0           0       2379        60767.90           1           1   \n",
       "13089.0           2       1818        58825.83           1           1   \n",
       "15769.0           7        130        56252.72           1           1   \n",
       "15061.0           3        403        54534.14           1           1   \n",
       "\n",
       "            m_quartile RFMScore  \n",
       "CustomerID                       \n",
       "18102.0              1      111  \n",
       "17450.0              1      111  \n",
       "17511.0              1      111  \n",
       "16684.0              1      111  \n",
       "14096.0              1      111  \n",
       "13694.0              1      111  \n",
       "15311.0              1      111  \n",
       "13089.0              1      111  \n",
       "15769.0              1      111  \n",
       "15061.0              1      111  "
      ]
     },
     "execution_count": 218,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "segmented_rfm[segmented_rfm['RFMScore']=='111'].sort_values('monetary_value', ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Learn more\n",
    "\n",
    "[Buy ’Til You Die - A Walkthrough](https://cran.r-project.org/web/packages/BTYD/vignettes/BTYD-walkthrough.pdf)\n",
    "\n",
    "[Customer Segmentation for a wine seller](http://blog.yhat.com/posts/customer-segmentation-using-python.html)\n",
    "\n",
    "If you are interested in implenting RFM in R, [Kimberly Coffey](http://www.kimberlycoffey.com/blog/2016/8/k-means-clustering-for-customer-segmentation) has a [great tutorial](http://www.kimberlycoffey.com/blog/2016/8/k-means-clustering-for-customer-segmentation) on the same dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "reference: [Blast Analytics and Marketing](http://www.blastam.com/blog/rfm-analysis-boosts-sales)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
